Given the results of the analysis of the Fluidigm data, here I explore the role of the V intercept in a more complex data (Allen).
We want to check if the intercept acts as a size factor here too and, if true, if W capture some interesting biology.
Interestingly, in this more complex dataset, both intercepts are needed to get a clear picture of the data in two dimensions.
As a first pass, I will only focus on the cells that passed the QC filters (by the original authors) and that were part of the “core” clusters. I will color-code the cells by either known cell type or by inferred cluster (inferred in the original study).
Select cells, remove ERCC spike-ins, filter out the genes that do not have at least 10 counts in at least 10 cells.
data("allen")
allen_core <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
which(colData(allen)$Core.Type=="Core")]
filter <- apply(assay(allen_core)>10, 1, sum)>=10
Number of retained genes:
print(sum(filter))
## [1] 11545
To speed up the computations, I will focus on the top 1,000 most variable genes.
allen_core <- allen_core[filter,]
core <- assay(allen_core)
vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]
First, let’s look at PCA (of the log counts) for reference.
par(mfrow=c(1, 2))
bio <- as.factor(colData(allen_core)$driver_1_s)
cl <- as.factor(colData(allen_core)$Primary.Type)
detection_rate <- colSums(core>0)
coverage <- colSums(core)
pca <- prcomp(t(log1p(core)))
plot(pca$x, col=cols[bio], pch=19, main="PCA of log-counts, centered not scaled")
legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts, centered not scaled")
df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## PC1 PC2 detection_rate coverage
## PC1 1.00000000 -0.01052424 0.8066812 0.4453172
## PC2 -0.01052424 1.00000000 -0.3601716 -0.3286485
## detection_rate 0.80668124 -0.36017158 1.0000000 0.5275845
## coverage 0.44531717 -0.32864852 0.5275845 1.0000000
print(system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 2)))
## user system elapsed
## 106.854 30.021 64.516
Plot the results with cells colored according to their biological condition.
par(mfrow=c(1, 2))
plot(zinb_Vall@W, col=cols[bio], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottomright", levels(bio), fill=cols)
plot(zinb_Vall@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("topright", levels(cl), fill=cols2)
One interpretation of the model is that \(\gamma_mu\) will act as a “size factor”. Here, we check whether it captures sequencing depth and/or detection rate.
#total number of detected genes in the cell
df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.00000000 0.03514491 -0.0843733 0.3598978
## W2 0.03514491 1.00000000 -0.3676352 0.3710653
## gamma_mu -0.08437330 -0.36763518 1.0000000 -0.3348686
## gamma_pi 0.35989777 0.37106533 -0.3348686 1.0000000
## detection_rate -0.35188976 -0.34026613 0.3712522 -0.9939701
## coverage -0.05250198 -0.05649406 0.8504471 -0.4846195
## detection_rate coverage
## W1 -0.3518898 -0.05250198
## W2 -0.3402661 -0.05649406
## gamma_mu 0.3712522 0.85044711
## gamma_pi -0.9939701 -0.48461953
## detection_rate 1.0000000 0.52758451
## coverage 0.5275845 1.00000000
\(\gamma_pi\) clearly captures the detection rate and \(\gamma_mu\) clearly captures the coverage. Interestingly, the two estimates are positively correlated.
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vall)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])
plot(rowMeans(log1p(getMu(zinb_Vall))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
par(mfrow=c(2, 2))
boxplot(zinb_Vall@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vall@beta_pi[1,], main="Beta_Pi")
plot(t(zinb_Vall@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vall@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")
print(system.time(zinb_Vnone <- zinbFit(core, ncores = 3, K = 2, V=matrix(0, ncol=1, nrow=NROW(core)))))
## user system elapsed
## 120.871 30.694 59.448
par(mfrow=c(1, 2))
plot(zinb_Vnone@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_Vnone@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
df <- data.frame(W1 = zinb_Vnone@W[,1], W2 = zinb_Vnone@W[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## W1 W2 detection_rate coverage
## W1 1.000000000 0.005426967 -0.4105128 -0.1328474
## W2 0.005426967 1.000000000 0.8736250 0.5865832
## detection_rate -0.410512821 0.873624999 1.0000000 0.5275845
## coverage -0.132847434 0.586583172 0.5275845 1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vnone)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])
plot(rowMeans(log1p(getMu(zinb_Vnone))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
par(mfrow=c(2, 2))
boxplot(zinb_Vnone@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vnone@beta_pi[1,], main="Beta_Pi")
plot(t(zinb_Vnone@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vnone@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")
V <- cbind(rep(0, NROW(core)), rep(1, NROW(core)))
print(system.time(zinb_Vmu <- zinbFit(core, ncores = 3, K = 2, V=V, which_V_mu=2L, which_V_pi=1L)))
## user system elapsed
## 117.983 30.208 62.073
par(mfrow=c(1, 2))
plot(zinb_Vmu@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_Vmu@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
df <- data.frame(W1 = zinb_Vmu@W[,1], W2 = zinb_Vmu@W[,2], gamma_mu = zinb_Vmu@gamma_mu[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## W1 W2 gamma_mu detection_rate coverage
## W1 1.00000000 -0.06020466 0.0485493 -0.2663277 -0.1083201
## W2 -0.06020466 1.00000000 0.5593379 0.8898885 0.3772103
## gamma_mu 0.04854930 0.55933792 1.0000000 0.5880261 0.9235989
## detection_rate -0.26632771 0.88988849 0.5880261 1.0000000 0.5275845
## coverage -0.10832007 0.37721026 0.9235989 0.5275845 1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vmu)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])
plot(rowMeans(log1p(getMu(zinb_Vmu))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
par(mfrow=c(2, 2))
boxplot(zinb_Vmu@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vmu@beta_pi[1,], main="Beta_Pi")
plot(t(zinb_Vmu@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vmu@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")
print(system.time(zinb_Vpi <- zinbFit(core, ncores = 3, K = 2, V=V, which_V_mu=1L, which_V_pi=2L)))
## user system elapsed
## 152.040 40.292 80.801
par(mfrow=c(1, 2))
plot(zinb_Vpi@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_Vpi@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
df <- data.frame(W1 = zinb_Vpi@W[,1], W2 = zinb_Vpi@W[,2], gamma_pi = zinb_Vpi@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## W1 W2 gamma_pi detection_rate
## W1 1.00000000 0.1305085 0.09593638 -0.09884142
## W2 0.13050848 1.0000000 -0.37656124 0.27518468
## gamma_pi 0.09593638 -0.3765612 1.00000000 -0.98667190
## detection_rate -0.09884142 0.2751847 -0.98667190 1.00000000
## coverage 0.15676103 -0.2568486 -0.47896136 0.52758451
## coverage
## W1 0.1567610
## W2 -0.2568486
## gamma_pi -0.4789614
## detection_rate 0.5275845
## coverage 1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vpi)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])
plot(rowMeans(log1p(getMu(zinb_Vpi))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
par(mfrow=c(2, 2))
boxplot(zinb_Vpi@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vpi@beta_pi[1,], main="Beta_Pi")
plot(t(zinb_Vpi@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vpi@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")
print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3)))
## user system elapsed
## 121.030 32.532 61.484
pairs(zinb_3@W, col=cols[bio], pch=19)
pairs(zinb_3@W, col=cols2[cl], pch=19)
## write matrices to file to feed to ZIFA in python
write.csv(log1p(core), file="allen.csv")
core <- assay(allen_core)
dim(core)
## [1] 11545 285
print(system.time(zinb_all <- zinbFit(core, ncores = 3, K = 2)))
## user system elapsed
## 1162.914 161.906 572.368
par(mfrow=c(1, 2))
plot(zinb_all@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_all@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
Interestingly, when using all the genes, things don’t work nicely anymore. I wonder if the difference is the selection of the most variable genes, rather than the complexity of the data.
import sys
sys.path.insert(1, '/usr/local/lib/python2.7/site-packages/')
from pandas import read_csv
from ZIFA import ZIFA
import numpy as np
df = read_csv('allen.csv')
del df['Unnamed: 0']
lc = np.array(df)
Y = np.transpose(lc)
Z, model_params = ZIFA.fitModel(Y, 2)
np.savetxt('allen_zifa.csv', Z, delimiter=',')
## Running zero-inflated factor analysis with N = 285, D = 1000, K = 2
## Param change below threshold 1.000e-02 after 6 iterations
par(mfrow=c(1, 2))
zifa_res <- read.csv("allen_zifa.csv", header=FALSE)
plot(zifa_res, col=cols[bio], pch=19, xlab="Z1", ylab="Z2")
plot(zifa_res, col=cols2[cl], pch=19, xlab="Z1", ylab="Z2")
wrapRzifa <- function(Y, block = F){
d = digest(Y, "md5")
tmp = paste0(tempdir(), '/', d)
write.csv(Y, paste0(tmp, '.csv'))
if (!file.exists(paste0(tmp, '_zifa.csv'))){
print('run ZIFA')
bb = ifelse(block, '-b ', '')
cmd = sprintf('python run_zifa.py %s%s.csv %s_zifa.csv', bb, tmp, tmp)
system(cmd)
}
read.csv(sprintf("%s_zifa.csv", tmp), header=FALSE)
}
Y = log2(core + 1)
zifa_res = wrapRzifa(Y)
plot(zifa_res, col=cols[bio], pch=19, xlab="Z1", ylab="Z2")
plot(zifa_res, col=cols2[cl], pch=19, xlab="Z1", ylab="Z2")
df <- data.frame(Z1 = zifa_res$V1,
Z2 = zifa_res$V2,
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## Z1 Z2 detection_rate coverage
## Z1 1.00000000 0.07493093 -0.3890703 -0.2343373
## Z2 0.07493093 1.00000000 0.6859741 0.4423235
## detection_rate -0.38907034 0.68597407 1.0000000 0.5275845
## coverage -0.23433727 0.44232350 0.5275845 1.0000000
df <- data.frame(W1 = zinb_Vall@W[,1],
W2 = zinb_Vall@W[,2],
Z1 = zifa_res$V1,
Z2 = zifa_res$V2)
pairs(df, pch=19, col=cols[bio], main = 'Intercept in both Mu and Pi')
print(cor(df, method="spearman"))
## W1 W2 Z1 Z2
## W1 1.00000000 0.03514491 0.87737303 -0.06011083
## W2 0.03514491 1.00000000 -0.27471165 -0.77121359
## Z1 0.87737303 -0.27471165 1.00000000 0.07493093
## Z2 -0.06011083 -0.77121359 0.07493093 1.00000000
df <- data.frame(W1 = zinb_Vnone@W[,1],
W2 = zinb_Vnone@W[,2],
Z1 = zifa_res$V1,
Z2 = zifa_res$V2)
pairs(df, pch=19, col=cols[bio], main = 'No intercept')
print(cor(df, method="spearman"))
## W1 W2 Z1 Z2
## W1 1.000000000 0.005426967 0.940024468 0.11583976
## W2 0.005426967 1.000000000 -0.008162482 0.88366985
## Z1 0.940024468 -0.008162482 1.000000000 0.07493093
## Z2 0.115839757 0.883669851 0.074930925 1.00000000
df <- data.frame(W1 = zinb_Vmu@W[,1],
W2 = zinb_Vmu@W[,2],
Z1 = zifa_res$V1,
Z2 = zifa_res$V2)
pairs(df, pch=19, col=cols[bio], main = 'Intercept in Mu')
print(cor(df, method="spearman"))
## W1 W2 Z1 Z2
## W1 1.00000000 -0.06020466 0.93880730 0.27101971
## W2 -0.06020466 1.00000000 -0.21813102 0.85510842
## Z1 0.93880730 -0.21813102 1.00000000 0.07493093
## Z2 0.27101971 0.85510842 0.07493093 1.00000000
df <- data.frame(W1 = zinb_Vpi@W[,1],
W2 = zinb_Vpi@W[,2],
Z1 = zifa_res$V1,
Z2 = zifa_res$V2)
pairs(df, pch=19, col=cols[bio], main = 'Intercept in Pi')
print(cor(df, method="spearman"))
## W1 W2 Z1 Z2
## W1 1.0000000 0.1305085 0.88529862 0.37835330
## W2 0.1305085 1.0000000 0.13259032 0.63760776
## Z1 0.8852986 0.1325903 1.00000000 0.07493093
## Z2 0.3783533 0.6376078 0.07493093 1.00000000